package servlets;

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.regex.*;

import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Prints the id of every {@code <div>} element of an RTBF HTML page whose id
 * contains "post-" followed by five digits.
 */
public class TestParseo {
	/** Pattern for ids of the form "post-xxxxx"; compiled once and reused. */
	private static final Pattern POST_ID = Pattern.compile("post-\\d{5}");

	/**
	 * Fetches the page, parses it into an {@link HTMLDocument} and prints the
	 * id of each matching {@code <div>} to standard output, one per line.
	 *
	 * @param argv command-line arguments (unused)
	 * @throws Exception if the page cannot be fetched or parsed
	 */
	public static void main(String[] argv) throws Exception {
		URL url = new URL("http://www.rtbf.be/culture/categorie/cinema");
		HTMLEditorKit kit = new HTMLEditorKit();
		HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
		// Ask the parser to ignore any charset directive found in the page.
		doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

		// NOTE(review): the stream is decoded with the platform default
		// charset - confirm this matches the encoding of the page.
		Reader reader = new InputStreamReader(url.openConnection()
				.getInputStream());
		try {
			kit.read(reader, doc, 0);
		} finally {
			// Release the connection's stream even when parsing fails.
			reader.close();
		}

		// Walk every element of the parsed document.
		ElementIterator it = new ElementIterator(doc);
		Element elem;
		while ((elem = it.next()) != null) {
			if (!"div".equals(elem.getName())) {
				continue;
			}
			// The attribute is null for a div that carries no id.
			Object id = elem.getAttributes().getAttribute(HTML.Attribute.ID);
			if (isPostId(id)) {
				System.out.println(id);
			}
		}
		// Explicit exit kept from the original; presumably guards against
		// lingering non-daemon threads - TODO confirm it is still needed.
		System.exit(0);
	}

	/**
	 * Tells whether an id attribute value denotes a post.
	 *
	 * @param id the raw attribute value; may be {@code null} or a non-string
	 * @return {@code true} if {@code id} is a string containing "post-"
	 *         followed by five digits, {@code false} otherwise
	 */
	static boolean isPostId(Object id) {
		if (!(id instanceof String)) {
			return false;
		}
		Matcher m = POST_ID.matcher((String) id);
		return m.find();
	}
}